// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef PPL3RISCVKERNEL_SRC_FP16_GEMM_COMMON_RVV_1_0_CTO8C_KERNEL_H_
#define PPL3RISCVKERNEL_SRC_FP16_GEMM_COMMON_RVV_1_0_CTO8C_KERNEL_H_

namespace ppl { namespace kernel { namespace riscv {

template <int64_t atom_n>
static void conv_gemm_cto8c_m8nx_kernel_core_fp16(
    const __fp16* A,
    const __fp16* B,
    __fp16* C,
    int64_t k,
    int64_t total_n)
{
    asm volatile(
        ".equ            ATOM_N, %c[ATOM_N]         \n\t"

        "addi            s3, zero, 8                \n\t"
        "vsetvli         s2, s3, e16                \n\t"

        "mv              s2, %[A_LOC]               \n\t"
        "mv              s3, %[B_LOC]               \n\t"
        "mv              s4, %[C_LOC]               \n\t"
        "mv              s5, %[K]                   \n\t"
        "mv              s6, %[B_STRIDE]            \n\t"
        "addi            s7, zero, 4                \n\t"

        "0:                                             \n\t" // init
        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        "addi            s5, s5, -1                 \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmul.vv        v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmul.vv        v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmul.vv        v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmul.vv        v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmul.vv        v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmul.vv        v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmul.vv        v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmul.vv        v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmul.vv        v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmul.vv        v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmul.vv        v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmul.vv        v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmul.vv        v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmul.vv        v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmul.vv        v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmul.vv        v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmul.vv        v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmul.vv        v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmul.vv        v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmul.vv        v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmul.vv        v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmul.vv        v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmul.vv        v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmul.vv        v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "beq             s5, zero, 3f               \n\t"
        "blt             s5, s7, 2f                 \n\t"

        "1:                                             \n\t" // loop k
        "addi            s5, s5, -4                 \n\t"
        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmacc.vv       v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmacc.vv       v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmacc.vv       v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmacc.vv       v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmacc.vv       v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmacc.vv       v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmacc.vv       v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmacc.vv       v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmacc.vv       v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmacc.vv       v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmacc.vv       v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmacc.vv       v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmacc.vv       v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmacc.vv       v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmacc.vv       v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmacc.vv       v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmacc.vv       v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmacc.vv       v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmacc.vv       v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmacc.vv       v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmacc.vv       v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmacc.vv       v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmacc.vv       v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmacc.vv       v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmacc.vv       v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmacc.vv       v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmacc.vv       v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmacc.vv       v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmacc.vv       v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmacc.vv       v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmacc.vv       v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmacc.vv       v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmacc.vv       v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmacc.vv       v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmacc.vv       v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmacc.vv       v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmacc.vv       v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmacc.vv       v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmacc.vv       v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmacc.vv       v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmacc.vv       v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmacc.vv       v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmacc.vv       v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmacc.vv       v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmacc.vv       v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmacc.vv       v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmacc.vv       v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmacc.vv       v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmacc.vv       v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmacc.vv       v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmacc.vv       v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmacc.vv       v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmacc.vv       v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmacc.vv       v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmacc.vv       v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmacc.vv       v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmacc.vv       v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmacc.vv       v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmacc.vv       v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmacc.vv       v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmacc.vv       v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmacc.vv       v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmacc.vv       v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmacc.vv       v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmacc.vv       v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmacc.vv       v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmacc.vv       v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmacc.vv       v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmacc.vv       v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmacc.vv       v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmacc.vv       v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmacc.vv       v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmacc.vv       v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmacc.vv       v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmacc.vv       v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmacc.vv       v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmacc.vv       v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmacc.vv       v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmacc.vv       v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmacc.vv       v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmacc.vv       v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmacc.vv       v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmacc.vv       v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmacc.vv       v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmacc.vv       v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmacc.vv       v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmacc.vv       v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmacc.vv       v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmacc.vv       v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmacc.vv       v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmacc.vv       v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmacc.vv       v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmacc.vv       v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmacc.vv       v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmacc.vv       v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmacc.vv       v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "bge             s5, s7, 1b                 \n\t"
        "beq             s5, zero, 3f               \n\t"

        "2:                                             \n\t" // k left
        "vle.v           v0, (s2)                   \n\t"
        "addi            s2, s2, 16                 \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vle.v           v1, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 0                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vle.v           v2, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 8                         \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vle.v           v3, (s3)                   \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 24                            \n\t"
        "addi            s3, s3, 16                 \n\t"
        ".elseif ATOM_N > 16                        \n\t"
        "add             s3, s3, s6                 \n\t"
        ".endif                                     \n\t"
        "addi            s5, s5, -1                 \n\t"

        ".if ATOM_N > 0                             \n\t"
        "vrgather.vi     v4, v1, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vrgather.vi     v5, v1, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 0                             \n\t"
        "vfmacc.vv       v8, v0, v4                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vrgather.vi     v6, v1, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vfmacc.vv       v9, v0, v5                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vrgather.vi     v7, v1, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vfmacc.vv       v10, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vfmacc.vv       v11, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vrgather.vi     v4, v1, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vrgather.vi     v5, v1, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vfmacc.vv       v12, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vrgather.vi     v6, v1, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vfmacc.vv       v13, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vrgather.vi     v7, v1, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vfmacc.vv       v14, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vfmacc.vv       v15, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vrgather.vi     v4, v2, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vrgather.vi     v5, v2, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vfmacc.vv       v16, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vrgather.vi     v6, v2, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vfmacc.vv       v17, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vrgather.vi     v7, v2, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vfmacc.vv       v18, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vfmacc.vv       v19, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vrgather.vi     v4, v2, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vrgather.vi     v5, v2, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vfmacc.vv       v20, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vrgather.vi     v6, v2, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vfmacc.vv       v21, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vrgather.vi     v7, v2, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vfmacc.vv       v22, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vfmacc.vv       v23, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vrgather.vi     v4, v3, 0                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vrgather.vi     v5, v3, 1                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vfmacc.vv       v24, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vrgather.vi     v6, v3, 2                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vfmacc.vv       v25, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vrgather.vi     v7, v3, 3                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vfmacc.vv       v26, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vfmacc.vv       v27, v0, v7                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vrgather.vi     v4, v3, 4                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vrgather.vi     v5, v3, 5                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vfmacc.vv       v28, v0, v4                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vrgather.vi     v6, v3, 6                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vfmacc.vv       v29, v0, v5                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vrgather.vi     v7, v3, 7                  \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vfmacc.vv       v30, v0, v6                \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vfmacc.vv       v31, v0, v7                \n\t"
        ".endif                                     \n\t"

        "bnez            s5, 2b                     \n\t"

        "3:                                             \n\t" // end
        ".if ATOM_N > 0                             \n\t"
        "vse.v           v8, (s4)                   \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 1                             \n\t"
        "vse.v           v9, (s4)                   \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 2                             \n\t"
        "vse.v           v10, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 3                             \n\t"
        "vse.v           v11, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 4                             \n\t"
        "vse.v           v12, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 5                             \n\t"
        "vse.v           v13, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 6                             \n\t"
        "vse.v           v14, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 7                             \n\t"
        "vse.v           v15, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 8                             \n\t"
        "vse.v           v16, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 9                             \n\t"
        "vse.v           v17, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 10                            \n\t"
        "vse.v           v18, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 11                            \n\t"
        "vse.v           v19, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 12                            \n\t"
        "vse.v           v20, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 13                            \n\t"
        "vse.v           v21, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 14                            \n\t"
        "vse.v           v22, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 15                            \n\t"
        "vse.v           v23, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 16                            \n\t"
        "vse.v           v24, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 17                            \n\t"
        "vse.v           v25, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 18                            \n\t"
        "vse.v           v26, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 19                            \n\t"
        "vse.v           v27, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 20                            \n\t"
        "vse.v           v28, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 21                            \n\t"
        "vse.v           v29, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 22                            \n\t"
        "vse.v           v30, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"
        ".if ATOM_N > 23                            \n\t"
        "vse.v           v31, (s4)                  \n\t"
        "addi            s4, s4, 16                 \n\t"
        ".endif                                     \n\t"

        :
        : [ATOM_N] "i"(atom_n), [A_LOC] "r"(A), [B_LOC] "r"(B), [C_LOC] "r"(C), [K] "r"(k), [B_STRIDE] "r"(total_n * 2 - (atom_n - 1) / 8 * 8 * 2)

        : "memory", "s2", "s3", "s4", "s5", "s6", "s7", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}

}}}; // namespace ppl::kernel::riscv

#endif
